# Package setup ----
# Use these lines for installing/loading. pacman::p_load() installs any
# package that is missing and then attaches it.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}

# "cowgrid" is not a package; cowplot provides plot_grid(), used for the
# marathon plots. openintro is loaded here (rather than reinstalled on every
# run) because the US counties section reads its `county` dataset.
pacman::p_load(tidyverse, palmerpenguins, here, cowplot, openintro)
package 'openintro' successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\matto\AppData\Local\Temp\RtmpqCTqRe\downloaded_packages
# Read the accident records from data/accidents.csv.
accidents_data <- read_csv(here("data", "accidents.csv"))

# glimpse(accidents_data) # Quick glimpse of the data

# Converts day of week (Monday, Friday, etc.) to a weekend or weekday
accidents_data <- accidents_data |>
  mutate(
    weekday_or_weekend = ifelse(
      day_of_week %in% c("Saturday", "Sunday"),
      "Weekend",
      "Weekday"
    )
  )

# Density plot: one panel per weekday/weekend, one filled curve per severity
ggplot(accidents_data, aes(x = time, fill = severity)) +
  geom_density(alpha = 0.5, adjust = 1) +
  facet_wrap(~weekday_or_weekend, nrow = 2) +
  scale_fill_manual(
    values = c("Fatal" = "#9370DB", "Serious" = "#87CEEB", "Slight" = "#FFFFE0")
  ) +
  theme_minimal() +
  labs(
    x = "Time of Day",
    y = "Density",
    title = "Number of Accidents throughout the day\nBy day of week and severity"
  )
2 - NYC marathon winners
library("cowplot") # I used this library for plot_grid() to stack the plots.

nyc_marathon_data <- read_csv(here("data", "nyc_marathon.csv"))

# Quick glimpse of the data
# glimpse(nyc_marathon_data)

# Code for the histogram
histogram <- ggplot(nyc_marathon_data, aes(x = time)) +
  geom_histogram(binwidth = 75) +
  labs(x = "Marathon Completion Time")

# Code for the boxplot, removing the extra axis info so it looks cleaner
# stacked with the histogram
boxplot <- ggplot(nyc_marathon_data, aes(x = time)) +
  geom_boxplot() +
  labs(title = "A. Boxplot and Histogram") +
  theme_minimal(base_size = 14) +
  theme(
    axis.line = element_blank(),
    axis.ticks = element_blank(),
    axis.title.y = element_blank(),
    axis.title.x = element_blank(),
    axis.text.y = element_blank()
  )

# This function stacks the boxplot and histogram on top of each other
plot_grid(
  boxplot,
  histogram,
  align = "v",
  axis = "l",
  ncol = 1,
  heights = c(1, 3)
)
# Answer to Part A: The histogram gives a better visual of the distribution of
# runners that completed the marathon. We can see a clearer visual
# representation of the completion times. The boxplot, however, gives us a
# clearer indication of the median points, outliers, and quartiles. The
# boxplot lacks the visualization of the distribution, while the histogram
# lacks the statistical information such as median, outliers, and quartiles.

# Part B Code: side-by-side boxplots of finishing time (hours) per division
ggplot(nyc_marathon_data, aes(x = time_hrs, y = division, fill = division)) +
  geom_boxplot() +
  scale_fill_manual(values = c("Men" = "Blue", "Women" = "Red")) +
  theme_minimal(base_size = 14) +
  labs(
    title = "Men vs Women Marathon Times",
    x = "Marathon Time by Hour",
    y = "Division by Gender"
  )
# Answer to Part B: Compared to women, men have a distribution range of
# marathon times. Men also have a smaller quartile range, and on average, a
# faster completion time. Women have more outlier points.

# Part C Code: same comparison with redundant ink removed.
# The fill is a constant gray, so no fill mapping (and no legend) is needed.
ggplot(nyc_marathon_data, aes(x = time_hrs, y = division)) +
  geom_boxplot(show.legend = FALSE, fill = "gray") +
  # theme_minimal() is a complete theme: it must come BEFORE the theme()
  # tweak, otherwise it resets axis.title.y and the y-axis label reappears.
  theme_minimal(base_size = 14) +
  theme(axis.title.y = element_blank()) +
  labs(
    title = "Men vs Women Marathon Times",
    x = "Marathon Time by Hour"
  )
# Answer to Part C: The separation by Gender already speaks for itself a bit.
# I removed the legend and y-axis label. I also decided to remove the color
# and just use a default gray fill, although I suppose I could have kept the
# blue vs red/pink in. Those are sort of historical default "gender" color
# separators. My changes affect the data-to-ink ratio by cutting down a lot of
# excess colors and information in order to simplify the visual.

# Part D Code: winning time per year, one line per division
ggplot(
  nyc_marathon_data,
  aes(x = year, y = time_hrs, color = division, shape = division)
) +
  geom_point() +
  geom_line(aes(group = division)) + # necessary to connect the points
  scale_color_manual(values = c("Men" = "Blue", "Women" = "Red")) +
  labs(
    title = "Marathon Times by Year",
    x = "Year",
    y = "Marathon Time",
    color = "Division",
    shape = "Division"
  ) +
  theme_minimal(base_size = 14)
#Answer to Part D: What is more visible in this plot compared to the others is the trend. We can visualize the marathon times by year, especially noting the marathon times improving significantly since 1970.
3 - US counties
# Setup
# This filters out the N/A values to remove any warning messages.
# The filtered result is assigned back to `county`; the dataset is taken
# explicitly from the openintro package so this does not depend on the
# package being attached.
county <- openintro::county |>
  filter(!is.na(homeownership), !is.na(poverty))
# Part A: The following code attempts to create a boxplot with categorical
# data of smoking ban in the x-axis and numerical pop in the y-axis. It also
# tries to layer this with a scatterplot of numerical median income data, and
# categorical median education data. This code will output a plot but it does
# not really work correctly nor does it make sense. First off, geom_point will
# create a scatterplot, which is expecting two continuous variables.
# median_edu is not a continuous variable, so this will cause issues. I think
# the boxplot is fine, but combining it with a scatterplot does not make sense
# and will lead to a confusing graphic.

# na.rm = TRUE drops missing values silently inside each layer. Wrapping the
# ggplot object in suppressWarnings() does not help, because the warnings are
# raised when the plot is printed, after suppressWarnings() has returned.
ggplot(county) +
  geom_point(aes(x = median_edu, y = median_hh_income), na.rm = TRUE) +
  geom_boxplot(aes(x = smoking_ban, y = pop2017), na.rm = TRUE)
# Part B: The second plot (the one where the median_edu columns are stacked
# vertically) makes it much easier to compare. The spacing is better which
# makes it easier to read. It's also easier to read the charts up from down
# rather than side to side. This means the placement of the faceting variable
# is dependent on what we want to compare. In this case, we want to see the
# poverty levels across people from different median education levels, so
# placing the median education levels in vertical columns makes the most
# sense.

# Rows without a median_edu value are dropped before faceting; na.rm = TRUE
# silences the missing-value warning at draw time (suppressWarnings() around
# the ggplot object would not, since the plot is printed afterwards).
county |>
  filter(!is.na(median_edu)) |>
  ggplot() +
  geom_point(aes(x = homeownership, y = poverty), na.rm = TRUE) +
  facet_grid(median_edu ~ .)
# Plot B: scatterplot with a single quadratic (degree-2 polynomial) fit
ggplot(county, aes(x = homeownership, y = poverty)) +
  geom_point(size = 2) +
  geom_smooth(
    method = "lm",
    formula = y ~ poly(x, 2),
    color = "blue",
    se = FALSE
  ) +
  theme_gray() +
  labs(
    title = "Plot B",
    x = "homeownership",
    y = "poverty"
  )
# Plot C: points with one green smooth per metro group.
# `group = metro` replaces `line = metro`: "line" is not a ggplot2 aesthetic
# and was silently ignored, so only one smooth was drawn.
ggplot(county, aes(x = homeownership, y = poverty, group = metro)) +
  geom_point() +
  geom_smooth(se = FALSE, color = "green") +
  labs(
    title = "Plot C",
    x = "homeownership",
    y = "poverty"
  )
# Plot D: one blue smooth per metro group, drawn underneath the points.
# `group = metro` replaces `line = metro`: "line" is not a ggplot2 aesthetic
# and was silently ignored, so only one smooth was drawn.
ggplot(county, aes(x = homeownership, y = poverty, group = metro)) +
  geom_smooth(se = FALSE, color = "blue") +
  geom_point() +
  labs(
    title = "Plot D",
    x = "homeownership",
    y = "poverty"
  )
# Plot E: points colored by metro, smooths distinguished by line type.
# The invalid `line = metro` aesthetic was dropped; the linetype mapping on
# geom_smooth() already splits the smooth by metro.
ggplot(county, aes(x = homeownership, y = poverty)) +
  geom_point(aes(color = metro), size = 2) +
  geom_smooth(aes(linetype = metro), se = FALSE) +
  scale_linetype_manual(values = c("no" = "solid", "yes" = "dashed")) +
  theme_gray() +
  labs(
    title = "Plot E",
    x = "homeownership",
    y = "poverty"
  )
# Plot F: points and smooths both colored by metro.
# The invalid `line = metro` aesthetic was dropped; the color mapping on
# geom_smooth() already splits the smooth by metro.
ggplot(county, aes(x = homeownership, y = poverty)) +
  geom_point(aes(color = metro), size = 2) +
  geom_smooth(aes(color = metro), se = FALSE) +
  theme_gray() +
  labs(
    title = "Plot F",
    x = "homeownership",
    y = "poverty"
  )
# Plot G: points colored by metro with a single quadratic fit over all points
ggplot(county, aes(x = homeownership, y = poverty)) +
  geom_point(aes(color = metro), size = 2) +
  geom_smooth(
    method = "lm",
    formula = y ~ poly(x, 2),
    color = "blue",
    se = FALSE
  ) +
  theme_gray() +
  labs(
    title = "Plot G",
    x = "homeownership",
    y = "poverty"
  )
credit_data <- read_csv(here("data", "credit.csv"))

# Glimpse of Data
# glimpse(credit_data)

# Part A: There appears to be a relationship between credit card balance and
# income. Generally, higher income means credit card balance will also be
# higher. This relationship varies a little depending on the student/married
# factors. Let's start with students. Students tend to have a higher credit
# card balance than non-students, so that shifts the starting point of the
# slope. Students also appear to have a slightly flatter slope. Non-married
# has a little steeper slope compared to married, but married income caps out
# higher.

# Note: I used this website to figure out how to label the axis:
# https://datavizpyr.com/dollar-format-for-axis-labels-with-ggplot2/
ggplot(
  credit_data,
  aes(x = income, y = balance, color = student, shape = student)
) +
  geom_point(size = 2, alpha = 0.5) +
  # color is inherited from the plot-level mapping
  geom_smooth(method = "lm", se = FALSE) +
  theme_gray() +
  theme(legend.position = "none") +
  facet_grid(
    rows = vars(student),
    cols = vars(married),
    # spelled out in full: `label =` only worked via partial argument matching
    labeller = labeller(
      student = c("No" = "student: No", "Yes" = "student: Yes"),
      married = c("No" = "married: No", "Yes" = "married: Yes")
    )
  ) +
  labs(
    x = "Income",
    y = "Credit Card Balance"
  ) +
  # scale = 1000 multiplies the income values by 1000 before formatting
  scale_x_continuous(labels = scales::dollar_format(scale = 1000)) +
  scale_y_continuous(labels = scales::dollar_format())
# Part B: I think that these factors are useful indicators. For example, I
# would expect a student to have a higher credit card balance compared to a
# non-student. I would also expect married income to cap out much higher
# compared to non-married incomes.

# Part C
# First, make the new column and calculate the variable
credit_data$credit_utilization <- credit_data$balance / credit_data$limit

ggplot(
  credit_data,
  aes(x = income, y = credit_utilization, color = student, shape = student)
) +
  geom_point(size = 2, alpha = 0.5) +
  # color is inherited from the plot-level mapping
  geom_smooth(method = "lm", se = FALSE) +
  theme_gray() +
  theme(legend.position = "none") +
  facet_grid(
    rows = vars(student),
    cols = vars(married),
    # spelled out in full: `label =` only worked via partial argument matching
    labeller = labeller(
      student = c("No" = "student: No", "Yes" = "student: Yes"),
      married = c("No" = "married: No", "Yes" = "married: Yes")
    )
  ) +
  labs(
    x = "Income",
    y = "Credit Utilization"
  ) +
  # scale = 1000 multiplies the income values by 1000 before formatting
  scale_x_continuous(labels = scales::dollar_format(scale = 1000)) +
  scale_y_continuous(labels = scales::percent_format())
#Part D: This new plot implies that credit utilization is affected by marital status and student status, not simply just income. Students tend to use their credit more, even with higher levels of income. Non-students tend to utilize less credit, but slightly increase the use with higher income. Married credit use seems to be a lot more flat across the board compared to non-married credit, which has more variance but is also dependent on student vs non-student status.
5 - Napoleon’s march.
# I primarily used
# http://euclid.psych.yorku.ca/www/psy6135/tutorials/Minard.html
# I followed their guide and made a few small tweaks. Each plot shown is the
# process that leads to the final one. Refer to the last plot for the finished
# product.

# Loading all necessary packages
library(ggplot2)
library(scales) # additional formatting for scales
library(grid) # combining plots
library(gridExtra) # combining plots
library(dplyr) # tidy data manipulations

napoleon_data <- read_rds(here("data", "napoleon.rds"))

# Converting to data frames: inspect the structure of the loaded object first
str(napoleon_data)
# Pull the three components out of the loaded object.
troops_df <- napoleon_data$troops
cities_df <- napoleon_data$cities
temp_df <- napoleon_data$temperatures

# Initially plots the path of the surviving troops.
# ggplot(troops_df, aes(long, lat)) +
#   geom_path(aes(size = survivors))

# I changed the colors a bit on this graphic.
breaks <- c(1, 2, 3) * 10^5 # Sets our own break size scaling

# NOTE(review): ggplot2 >= 3.4 prefers `linewidth` over `size` for lines;
# `size` is kept because a later step hides the legend via the size guide.
ggplot(troops_df, aes(long, lat)) +
  geom_path(
    aes(size = survivors, colour = direction, group = group),
    lineend = "round"
  ) +
  scale_size(
    "Survivors",
    range = c(1, 10), # c(0.5, 15),
    breaks = breaks,
    labels = scales::comma(breaks)
  ) +
  scale_color_manual(
    "Direction",
    values = c("#6ee78d", "#edd579"),
    labels = c("Advance", "Retreat")
  )
# New trick I learned. last_plot() retrieves the last plot that was created
# or modified.
plot_troops <- last_plot()

# ggrepel is necessary to keep labels from overlapping. Install it only when
# missing, then attach it once with library() (which errors if loading fails,
# unlike require()).
if (!requireNamespace("ggrepel", quietly = TRUE)) {
  install.packages("ggrepel")
}
library(ggrepel)

plot_troops +
  geom_point(data = cities_df) +
  # Ensures labels and points do not overlap
  geom_text_repel(data = cities_df, aes(label = city))
# Saving this version (troop paths plus labelled cities), then displaying it
plot_troops_cities <- last_plot()
plot_troops_cities
# This code appends the ° to temperature and joins it with the date, giving
# one text label per temperature reading
temp_df <- temp_df %>%
  mutate(label = paste0(temp, "° ", date))

head(temp_df$label)
# Plots the temperature and long data. Like before, the text repel is so that
# points do not overlap.
# NOTE(review): ggplot2 >= 3.4 prefers `linewidth` over `size` for
# geom_path(); confirm the installed version before switching.
ggplot(temp_df, aes(long, temp)) +
  geom_path(color = "grey", size = 1.5) +
  geom_point(size = 1) +
  geom_text_repel(aes(label = label), size = 2.5)
plot_temp <- last_plot() # saves the plot temp

# Places both grids vertically. Needs to be adjusted as done below.
# grid.arrange(plot_troops_cities, plot_temp)

# This code limits the x axis so that it matches temperature. It also removes
# the legends, color, and keeps the theme completely void.
# guides(<aes> = "none") replaces the deprecated guides(<aes> = FALSE).
plot_troops_cities +
  coord_cartesian(xlim = c(24, 38)) +
  guides(color = "none", size = "none") +
  theme_void()
# New x1 value for grid.arrange
plot_troops_cities_fixed <- last_plot()

# Removes the x axis value, keeps the theme simple, cleans a few elements, and
# matches the xlim value with the troops/cities graph
plot_temp +
  coord_cartesian(xlim = c(24, 38)) +
  labs(x = NULL, y = "Temperature") +
  theme_bw() +
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks = element_blank(),
    panel.border = element_blank()
  )
# New x2 value for grid.arrange
plot_temp_fixed <- last_plot()

# Finally arranges the grid with our final, updated plots. I added a title
# because I think that titles are important to have so that the audience knows
# what they are seeing. I also changed the color scheme a bit. I did not like
# the black retreat with black city text, so I changed that. I also used green
# because I like to think of green as more of an "advancement" color.
grid.arrange(
  plot_troops_cities_fixed,
  plot_temp_fixed,
  nrow = 2,
  heights = c(3.5, 1.2),
  top = "Napoleon's March"
)

# Draws a thin gray frame around the whole arranged figure
grid.rect(
  width = .99,
  height = .99,
  gp = gpar(lwd = 2, col = "gray", fill = NA)
)